#### Load the data ####

rd_papers <- read.csv("raw-data/rd-papers-meta.csv")

# Ensure only expected imputation values
stopifnot(all(
  sort(unique(rd_papers$imputation)) == c("", "ci90", "ci95", "from_rep_data", "tstat")
))

# Ensure estimates not in need of imputation in fact do not need it
stopifnot(all(!is.na(rd_papers$se_est[rd_papers$imputation == ""])))

# Ensure estimates in need of imputation in fact do need it
stopifnot(all(is.na(rd_papers$se_est[rd_papers$imputation != ""])))

# Function to do imputation
do_imputation <- function(df, type, impute_func) {
  df_copy <- df
  idx <- (rd_papers$imputation == type)
  # Check so at least one row is being updated
  stopifnot(any(idx))
  # Check so values to be imputed are missing
  stopifnot(all(is.na(df[idx, "se_est"])))
  imputed_se <- impute_func(df[idx, ])
  # Check so imputation function return correct number of values
  stopifnot(length(imputed_se) == sum(idx)) 
  df[idx, "se_est"] <- imputed_se
  # Check so no unintended edits were made
  stopifnot(identical(df[!idx, ], df_copy[!idx, ]))
  df
}

# Impute SE from reported t-stat
rd_papers <- do_imputation(
  rd_papers,
  "tstat",
  function(df) {
    stopifnot(all(!is.na(df$beta_est)))
    stopifnot(all(!is.na(df$tstat)))
    df$beta_est / df$tstat
  }
)

# Impute SE from reported 90% CI
rd_papers <- do_imputation(
  rd_papers,
  "ci90",
  function(df) {
    stopifnot(all(!is.na(df$ci_90_high)))
    stopifnot(all(!is.na(df$ci_90_low)))
    (df$ci_90_high - df$ci_90_low) / (2 * qnorm(0.95))
  }
)

# Impute SE from reported 95% CI
rd_papers <- do_imputation(
  rd_papers,
  "ci95",
  function(df) {
    stopifnot(all(!is.na(df$ci_95_high)))
    stopifnot(all(!is.na(df$ci_95_low)))
    (df$ci_95_high - df$ci_95_low) / (2 * qnorm(0.975))
  }
)


# Impute SE from replication data

meta_replication <- read.csv("data/meta-replication.csv")

stopifnot(
  all(sort(rd_papers$estimate_code[rd_papers$imputation == "from_rep_data"]) ==
        c("carson_sievert_2017", "caughey_etal_2017", "dbk_warshaw_2016")),
  all(sort(meta_replication$estimate_code) ==
        c("carson_sievert_2017", "caughey_etal_2017", "dbk_warshaw_2016"))
)

for (paper in c("carson_sievert_2017", "caughey_etal_2017", "dbk_warshaw_2016")) {
  pap_idx <- (rd_papers$estimate_code == paper)
  rep_idx <- (meta_replication$estimate_code == paper)
  stopifnot(sum(pap_idx) == 1, sum(rep_idx) == 1)
  for (col in c("beta_est", "se_est", "p_val")) {
    stopifnot(
      is.na(rd_papers[pap_idx, col]) ||
        abs(rd_papers[pap_idx, col] - meta_replication[rep_idx, col]) < 0.01
    )
    rd_papers[pap_idx, col] <- meta_replication[rep_idx, col]
  }
}

# All paper should now have non-missing point est and SE
stopifnot(all(!is.na(rd_papers$beta_est)))
stopifnot(all(!is.na(rd_papers$se_est)))

# Derive t-statistic if missing
tmis <- is.na(rd_papers$tstat)
stopifnot(all(is.na(rd_papers$tstat[tmis])))
rd_papers$tstat[tmis] <- with(rd_papers[tmis, ], abs(beta_est) / se_est)
stopifnot(all(!is.na(rd_papers$tstat)))

# Derive two-sided p-value using normal approx if missing
pmis <- is.na(rd_papers$p_val)
stopifnot(all(is.na(rd_papers$p_val[pmis])))
rd_papers$p_val[pmis] <- with(rd_papers[pmis, ], 2 * (1 - pnorm(tstat)))
stopifnot(all(!is.na(rd_papers$p_val)))


# Check derived vs recorded t-stats
derived_tstat <- abs(rd_papers$beta_est) / rd_papers$se_est
stopifnot(all(abs(derived_tstat - rd_papers$tstat) < 0.001))

# Check derived vs recorded p-values
derived_pvalues <- 2 * (1 - pnorm(rd_papers$tstat))
stopifnot(all(abs(derived_pvalues - rd_papers$p_val) < 0.001))

# Create column indicating year published
rd_papers$year <- as.integer(sapply(strsplit(rd_papers$article_code, "_"), function(x) rev(x)[1]))
stopifnot(all(!is.na(rd_papers$year)))

# Create column with estimate weights for articles with several estimates
stopifnot(!anyNA(rd_papers$disag_estimate))
estimate_count <- ave(rd_papers$disag_estimate, rd_papers$article_code, FUN = sum)
rd_papers$weight <- ifelse(rd_papers$disag_estimate, 1 / estimate_count, NA)

# Save cleaned CSV file
write.csv(rd_papers, "data/cl-rd-papers-meta.csv", row.names = FALSE)
